NLTK - Sentiment analysis

by: sid93, 7 years ago



import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize



class VoteClassifier(ClassifierI):
    """Ensemble classifier that labels input by majority vote.

    Each wrapped classifier must expose a ``classify(features)`` method.
    The label returned is the most common one among the individual votes.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that will take part in the vote."""
        # The attribute name must match what classify()/confidence() read;
        # the earlier spelling `_classifier` made both methods raise
        # AttributeError.
        self._classifiers = classifiers

    def _votes(self, features):
        """Return the list of labels, one per wrapped classifier."""
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the label chosen by most of the wrapped classifiers.

        Raises ``statistics.StatisticsError`` when there are no classifiers
        (``mode`` of an empty list). Tie handling follows ``statistics.mode``
        and therefore depends on the Python version in use.
        """
        return mode(self._votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        votes = self._votes(features)
        choice_votes = votes.count(mode(votes))
        return choice_votes / len(votes)

# Read both corpora in full. The text is split on newlines below, so each
# line is treated as one document (presumably one short review per line).
with open("positive.txt", "r") as pos_file:
    short_pos = pos_file.read()
with open("negative.txt", "r") as neg_file:
    short_neg = neg_file.read()

all_words = []
documents = []

# POS-tag prefixes to keep: J is adjective, R is adverb, and V is verb.
# Use ["J", "R", "V"] to keep all three; only adjectives are kept here.
allowed_word_types = ["J"]

# Positive documents are added first, then negative ones.
for corpus_text, label in ((short_pos, "pos"), (short_neg, "neg")):
    # Split on the newline character. Splitting on the letter 'n' (as the
    # code previously did) shreds every review into many tiny fragments and
    # inflates the number of documents enormously.
    for p in corpus_text.split('\n'):
        documents.append((p, label))
        for word, tag in nltk.pos_tag(word_tokenize(p)):
            # Keep a word only when the first letter of its tag is allowed.
            if tag[0] in allowed_word_types:
                all_words.append(word.lower())

with open("documents.pickle", "wb") as save_documents:
    pickle.dump(documents, save_documents)

all_words = nltk.FreqDist(all_words)

# NOTE(review): this takes the first 3000 keys in FreqDist iteration order,
# which in NLTK 3 is presumably not sorted by frequency -- confirm whether
# all_words.most_common(3000) was intended.
word_features = list(all_words.keys())[:3000]

with open("word_features5k.pickle", "wb") as save_word_features:
    pickle.dump(word_features, save_word_features)


def find_features(document):
    """Build the feature dict for one document.

    Tokenizes ``document`` and returns a dict that maps every word in the
    module-level ``word_features`` list to ``True`` if that word appears
    among the document's tokens, otherwise ``False``.
    """
    # A set gives constant-time membership tests; the tokens used to be kept
    # in a list, which was rescanned once per feature word.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}

# One (feature dict, label) pair per document.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Shuffle so that the split below mixes "pos" and "neg" examples.
random.shuffle(featuresets)
print(len(featuresets))

# The first 10000 shuffled examples train the models; the remainder is held
# out for testing (empty when there are 10000 or fewer feature sets).
testing_set = featuresets[10000:]
training_set = featuresets[:10000]


classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

###############
# `with` closes the pickle file even if pickle.dump raises.
with open("originalnaivebayes5k.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)



I am getting the following error at MNB_classifier.train:

Traceback (most recent call last):
  File "D:\Users\siddharth.u\Documents\Natural Language Processing\Creating a module for Sentiment Analysis with NLTK\Creating a module for Sentiment Analysis with NLTK.py", line 103, in <module>
    MNB_classifier.train(training_set)
  File "D:\Users\siddharth.u\AppData\Local\Continuum\Anaconda3\lib\site-packages\nltk\classify\scikitlearn.py", line 117, in train
    X = self._vectorizer.fit_transform(X)
  File "D:\Users\siddharth.u\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\feature_extraction\dict_vectorizer.py", line 230, in fit_transform
    return self._transform(X, fitting=True)
  File "D:\Users\siddharth.u\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\feature_extraction\dict_vectorizer.py", line 171, in _transform
    indices.append(vocab[f])
MemoryError



You must be logged in to post. Please login or register an account.



The MemoryError indicates that you are either running out of RAM or using the 32-bit version of Python and hitting the 2 GB per-process limit.
If 32-bit Python is the limiting factor, you can simply switch to the 64-bit version and you should be fine.
If you are running out of actual RAM, it's a different story.

-Tmesus 7 years ago

You must be logged in to post. Please login or register an account.